load packages
library(tidyverse)
library(glmnet)
library(caTools)
library(caret)
library(ROCR)
load coded data
coded = read.csv("~/Documents/code/dsnlab/automotion-test-set/tds_artifact_coded_volumes.csv")
load stripe detection data
fileDir = "~/Documents/code/dsnlab/automotion-test-set/output/"
filePattern = "tds_stripes_.*.csv"
file_list = list.files(fileDir, pattern = filePattern)
for (file in file_list){
# if the merged dataset doesn't exist, create it
if (!exists("stripes")){
temp = read.csv(paste0(fileDir,file))
stripes = data.frame(temp) %>%
rename("volume" = t,
"subjectID" = subject) %>%
select(-file)
rm(temp)
}
# if the merged dataset does exist, append to it
else {
temp_dataset = read.csv(paste0(fileDir,file))
temp_dataset = data.frame(temp_dataset) %>%
rename("volume" = t,
"subjectID" = subject) %>%
select(-file)
stripes = rbind(stripes, temp_dataset)
rm(temp_dataset)
}
}
load global intensities and rps
# define paths and variables
rpDir = '~/Documents/code/dsnlab/automotion-test-set/rp_txt/'
outputDir = '~/Documents/code/tds_auto-motion/auto-motion-output/'
study = "tds2"
rpPattern = "^rp_([0-9]{3})_(.*).txt"
rpCols = c("euclidian_trans","euclidian_rot","euclidian_trans_deriv","euclidian_rot_deriv","trash.rp")
# global intensities
intensities = read.csv(paste0(outputDir,study,'_globalIntensities.csv'))
# edit volume numbers for subject 157, stop3
intensities = intensities %>%
mutate(volume = ifelse(subjectID == 157 & run == "stop3" & volume > 43, volume - 1, volume))
# rp files
file_list = list.files(rpDir, pattern = rpPattern)
for (file in file_list){
# if the merged dataset doesn't exist, create it
if (!exists("rp")){
temp = read.table(paste0(rpDir,file))
colnames(temp) = rpCols
rp = data.frame(temp, file = rep(file,count(temp))) %>%
mutate(volume = row_number()) %>%
extract(file,c("subjectID","run"), rpPattern) %>%
mutate(subjectID = as.integer(subjectID))
rm(temp)
}
# if the merged dataset does exist, append to it
else {
temp_dataset = read.table(paste0(rpDir,file))
colnames(temp_dataset) = rpCols
temp_dataset = data.frame(temp_dataset, file = rep(file,count(temp_dataset))) %>%
mutate(volume = row_number()) %>%
extract(file,c("subjectID","run"), rpPattern) %>%
mutate(subjectID = as.integer(subjectID))
rp = rbind(rp, temp_dataset)
rm(temp_dataset)
}
}
join dataframes
joined = left_join(stripes, coded, by = c("subjectID", "run", "volume")) %>%
left_join(., intensities, by = c("subjectID", "run", "volume")) %>%
left_join(., rp, by = c("subjectID", "run", "volume")) %>%
mutate(striping = ifelse(is.na(striping), 0, striping),
intensity = ifelse(is.na(intensity), 0, intensity),
tile = paste0("tile_",tile),
artifact = ifelse(striping > 1, 1, 0),
confidence = ifelse(striping == 1, "maybe", "sure"),
confidence = as.factor(confidence)) %>%
group_by(subjectID, run, tile) %>%
mutate(Diff.mean = volMean - lag(volMean),
Diff.sd = volSD - lag(volSD)) %>%
spread(tile, freqtile_power)
load models
setwd("~/Documents/code/dsnlab/automotion-test-set")
model_log = readRDS("model_log_FP.rds")
model_svm = readRDS("model_svm_FP.rds")
split the data
set.seed(101)
sample = sample.split(joined$artifact, SplitRatio = .75)
training = subset(joined, sample == TRUE)
testing = subset(joined, sample == FALSE)
machine learning
tidy data
ml = joined %>%
group_by(subjectID, run) %>%
mutate(Diff.mean = ifelse(is.na(Diff.mean),0,Diff.mean),
Diff.sd = ifelse(is.na(Diff.sd),0,Diff.sd)) %>%
gather(tile,freqtile_power, starts_with("tile")) %>%
mutate(tile = paste0(tile,"_c")) %>%
group_by(subjectID, run, tile) %>%
mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
ungroup() %>%
select(-freqtile_power) %>%
spread(tile,freqtile_power_c) %>%
select(-fsl.volume, -striping, -intensity, -confidence, -trash.rp, -volMean, -volSD, -euclidian_rot, -euclidian_trans) %>%
select(subjectID, run, volume, artifact, everything())
logistic regression
x = as.matrix(ml[,-c(1,2,3,4)])
y = as.double(as.matrix(ml[, 4]))
pred = predict(model_log, newx = x, s=model_log$lambda.1se, type="response")
pred[pred > .05] = 1
pred[pred < .05] = 0
svm
full_svm = ml %>%
mutate(artifact = ifelse(artifact == 1, "yes","no"),
artifact = as.factor(artifact))
# re-run svm on full sample
full_pred = predict(model_svm, newdata = full_svm, type="prob") %>%
select(-no)
full_pred = as.matrix(full_pred)
full_pred[full_pred > .09] = "yes"
full_pred[full_pred < .09] = "no"
auto-motion process
man = joined %>%
gather(tile, freqtile_power, starts_with("tile")) %>%
filter(tile %in% c("tile_1", "tile_10")) %>%
# code trash based on mean, sd, and rp
ungroup %>%
mutate(meanDiff.mean = mean(Diff.mean, na.rm=TRUE),
sdDiff.mean = sd(Diff.mean, na.rm=TRUE),
meanDiff.sd = mean(Diff.sd, na.rm=TRUE),
sdDiff.sd = sd(Diff.sd, na.rm=TRUE),
# code volumes above mean thresholds as trash
upper.mean = meanDiff.mean + 2*sdDiff.mean,
lower.mean = meanDiff.mean - 2*sdDiff.mean,
trash.mean = ifelse(Diff.mean > upper.mean | Diff.mean < lower.mean, 1, 0),
trash.mean = ifelse(is.na(Diff.mean),0,trash.mean),
upper.sd = meanDiff.sd + 2*sdDiff.sd,
lower.sd = meanDiff.sd - 2*sdDiff.sd,
trash.sd = ifelse(Diff.sd > upper.sd | Diff.sd < lower.sd, 1, 0),
trash.sd = ifelse(is.na(Diff.sd),0,trash.sd),
# code volumes with more than +/- .25mm translation or rotation in Euclidian distance
trash.rp.tr = ifelse(euclidian_trans_deriv > .25 | euclidian_trans_deriv < -.25, 1, 0),
trash.rp.rot = ifelse(euclidian_rot_deriv > .25 | euclidian_rot_deriv < -.25, 1, 0)) %>%
select(-meanDiff.mean, -meanDiff.sd, -sdDiff.mean, -sdDiff.sd) %>%
# code trash based on striping
group_by(subjectID, run, tile) %>%
mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
ungroup() %>%
select(-freqtile_power) %>%
spread(tile,freqtile_power_c) %>%
mutate(trash.stripe = ifelse(tile_1 < -.035 & tile_10 > .00025, 1, 0)) %>%
# combine trash
mutate(trash.combined = ifelse(trash.stripe == 1, 1, 0),
trash.sum = trash.rp.tr + trash.rp.rot + trash.mean + trash.sd + trash.stripe,
trash.combined = ifelse((trash.rp.tr + trash.rp.rot + trash.mean + trash.sd) > 1, 1, trash.combined)) %>%
# recode as trash if volume behind and in front are both marked as trash
mutate(trash.combined = ifelse(trash.combined == 0 & lag(trash.combined) == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
# code first volume as trash if second volume is trash
mutate(trash.combined = ifelse(volume == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
# code hits
mutate(hits = ifelse(trash.combined == 1 & (artifact == 1), "hit",
ifelse(trash.combined == 0 & (artifact == 1), "neg",
ifelse(trash.combined == 1 & (artifact == 0), "pos", NA))),
label = ifelse(regexpr('.*', hits), as.character(volume), ''),
hits = as.factor(hits)) %>%
gather(tile, freqtile_power_c, c("tile_1", "tile_10"))
confusion matrices
logistic regression
confusionMatrix(pred, y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7419 43
## 1 124 116
##
## Accuracy : 0.9783
## 95% CI : (0.9748, 0.9815)
## No Information Rate : 0.9794
## P-Value [Acc > NIR] : 0.7543
##
## Kappa : 0.5708
## Mcnemar's Test P-Value : 0.0000000005994
##
## Sensitivity : 0.9836
## Specificity : 0.7296
## Pos Pred Value : 0.9942
## Neg Pred Value : 0.4833
## Prevalence : 0.9794
## Detection Rate : 0.9633
## Detection Prevalence : 0.9688
## Balanced Accuracy : 0.8566
##
## 'Positive' Class : 0
##
svm
confusionMatrix(full_pred, full_svm$artifact)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7462 55
## yes 81 104
##
## Accuracy : 0.9823
## 95% CI : (0.9791, 0.9852)
## No Information Rate : 0.9794
## P-Value [Acc > NIR] : 0.03331
##
## Kappa : 0.5957
## Mcnemar's Test P-Value : 0.03205
##
## Sensitivity : 0.9893
## Specificity : 0.6541
## Pos Pred Value : 0.9927
## Neg Pred Value : 0.5622
## Prevalence : 0.9794
## Detection Rate : 0.9688
## Detection Prevalence : 0.9760
## Balanced Accuracy : 0.8217
##
## 'Positive' Class : no
##
manual
man1 = man %>%
filter(tile == "tile_1")
confusionMatrix(man1$trash.combined, man1$artifact)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7481 30
## 1 62 129
##
## Accuracy : 0.9881
## 95% CI : (0.9854, 0.9904)
## No Information Rate : 0.9794
## P-Value [Acc > NIR] : 0.000000004137
##
## Kappa : 0.7311
## Mcnemar's Test P-Value : 0.001229
##
## Sensitivity : 0.9918
## Specificity : 0.8113
## Pos Pred Value : 0.9960
## Neg Pred Value : 0.6754
## Prevalence : 0.9794
## Detection Rate : 0.9713
## Detection Prevalence : 0.9752
## Balanced Accuracy : 0.9016
##
## 'Positive' Class : 0
##
plot and compare
join and plot models
# logistic regression
data.plot.log = bind_cols(ml, as.data.frame(y), as.data.frame(pred)) %>%
mutate(hits = ifelse(y == 1 & `1` == 1, "hit",
ifelse(y == 0 & `1` == 1, "pos",
ifelse(y == 1 & `1` == 0, "neg", NA))),
label = ifelse(regexpr('.*', hits), as.character(volume), ''),
hits = as.factor(hits))
# svm
data.plot.svm = bind_cols(full_svm, as.data.frame(full_pred)) %>%
rename("full_pred" = yes) %>%
mutate(hits = ifelse(artifact == "yes" & full_pred == "yes", "hit",
ifelse(artifact == "no" & full_pred == "yes", "pos",
ifelse(artifact == "yes" & full_pred == "no", "neg", NA))),
label = ifelse(regexpr('.*', hits), as.character(volume), ''),
hits = as.factor(hits))
# plot and compare models
plot.comp = man %>%
filter(tile == "tile_1") %>%
select(subjectID, run, volume, euclidian_trans_deriv, hits, confidence, intensity) %>%
rename("auto" = hits) %>%
left_join(data.plot.log, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
select(subjectID, run, volume, euclidian_trans_deriv, auto, hits, confidence, intensity) %>%
rename("log" = hits) %>%
left_join(data.plot.svm, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
select(subjectID, run, volume, euclidian_trans_deriv, auto, log, hits, confidence, intensity) %>%
rename("svm" = hits) %>%
gather(model, hits, c("auto", "log", "svm")) %>%
mutate(label = ifelse(regexpr('.*', hits), as.character(volume), ''),
intensity = as.factor(intensity))
ggplot(plot.comp, aes(x = volume, y = euclidian_trans_deriv)) +
geom_line(size = .25) +
geom_point(data = subset(plot.comp, !is.na(hits)), aes(color = hits, alpha = confidence, shape = intensity), size = 2.5) +
geom_text(aes(label = label), size = 1.5) +
facet_wrap(~ subjectID + run + model, ncol = 9, scales = "free") +
scale_color_manual(values = c("#3B9AB2","#EBCC2A","#F21A00")) +
scale_alpha_manual(values = c(.25, 1)) +
scale_shape_manual(values = c(16, 17, 15), labels = c("no", "maybe", "yes")) +
theme(axis.text.x = element_text(size = 6))
![]()